library(flexdashboard)
library(shinydashboard)
##
## Attaching package: 'shinydashboard'
## The following objects are masked from 'package:flexdashboard':
##
## renderValueBox, valueBox, valueBoxOutput
## The following object is masked from 'package:graphics':
##
## box
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.0
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(stringr)
library(ggiraph)
library(treemap)
library(d3treeR)
# Load the raw CSV inputs from the project's Data directory.
# NOTE(review): user_path is an absolute, machine-specific location; update it
# before running on another machine.
user_path <- "/Users/taylor/Documents/OMSBA 5210/DEC/OMSBA-5210-DEC"
data_dir <- file.path(user_path, "Data")
game_reviews <- read.csv(file.path(data_dir, "game_reviews.csv"))
global_sales <- read.csv(file.path(data_dir, "global_sales.csv"))
pc_game_sales <- read.csv(file.path(data_dir, "pc_game_sales.csv"))
twitch_game_data <- read.csv(file.path(data_dir, "twitch_game_data.csv"))
twitch_global_data <- read.csv(file.path(data_dir, "twitch_global_data.csv"))
# The former `data_set <- merge(global_sales, game_reviews, ...)` on Name alone
# was removed: its result was never read before `data_set` is rebuilt from the
# cleaned tables further down.
# Print large numbers in full instead of scientific notation.
options(scipen = 999)
# Clean the review table: coerce user_review to numeric and multiply it by 10
# (presumably to put it on the same scale as meta_score -- TODO confirm), trim
# whitespace from the join keys, then drop every row that has any NA.
game_reviews_temp <- game_reviews %>%
  mutate(
    user_review = as.numeric(user_review) * 10,
    platform = str_trim(platform),
    name = str_trim(name)
  ) %>%
  drop_na()
# Clean the sales table: keep releases from 2010 onwards, flag handheld
# consoles, rescale the sales columns, and expand platform abbreviations.
global_sales_temp <- global_sales %>%
  # Remove the "N/A" placeholder first so the year comparison below is numeric
  # instead of depending on how strings happen to sort.
  filter(Year != "N/A") %>%
  filter(as.numeric(Year) >= 2010) %>%
  mutate(
    # Computed before Platform is relabelled, so it matches the raw codes.
    handheld_console = Platform %in% c("DS", "3DS", "PSP", "PSV", "GB"),
    # Multiply every sales column by 1,000,000 (the source values are
    # presumably recorded in millions of units -- TODO confirm).
    across(
      c(NA_Sales, EU_Sales, JP_Sales, Other_Sales, Global_Sales),
      ~ .x * 1000000
    ),
    # Replace platform abbreviations with full names; codes not listed here
    # are kept as they are.
    Platform = case_when(
      Platform == "X360" ~ "Xbox 360",
      Platform == "PS3" ~ "PlayStation 3",
      Platform == "PS4" ~ "PlayStation 4",
      Platform == "XOne" ~ "Xbox One",
      Platform == "WiiU" ~ "Wii U",
      Platform == "PS2" ~ "PlayStation 2",
      Platform == "GBA" ~ "Game Boy Advance",
      Platform == "PSV" ~ "PlayStation Vita",
      Platform == "PS" ~ "PlayStation",
      Platform == "XB" ~ "Xbox",
      Platform == "GC" ~ "GameCube",
      Platform == "GB" ~ "Game Boy",
      Platform == "N64" ~ "Nintendo 64",
      TRUE ~ Platform
    ),
    Name = str_trim(Name)
  ) %>%
  drop_na(c(Year, Platform))
# Collapse the Twitch data to one row per game.
twitch_game_data_temp <- twitch_game_data %>%
  mutate(Game = str_trim(Game)) %>%
  filter(Game != "") %>%
  mutate(
    # Cut each metric at its first "(" and trim the remaining text.
    across(
      c(
        Hours_watched, Hours_Streamed, Peak_viewers, Peak_channels,
        Streamers, Avg_viewers, Avg_channels, Avg_viewer_ratio
      ),
      ~ str_trim(sub("\\(.*", "", .x))
    ),
    Hours_watched = as.numeric(Hours_watched),
    # Hours_Streamed has every non-digit character removed before conversion.
    Hours_Streamed = as.numeric(gsub("\\D", "", Hours_Streamed))
  ) %>%
  group_by(Game) %>%
  # Per-game totals and means, repeated on every row of the game.
  mutate(
    total_hours_watched = sum(Hours_watched),
    total_hours_streamed = sum(as.numeric(Hours_Streamed)),
    avg_peak_viewers = mean(as.numeric(Peak_viewers)),
    avg_peak_channels = mean(as.numeric(Peak_channels)),
    avg_streamers = mean(as.numeric(Streamers)),
    Avg_viewers = mean(as.numeric(Avg_viewers)),
    Avg_channels = mean(as.numeric(Avg_channels)),
    Avg_viewer_ratio = mean(as.numeric(Avg_viewer_ratio))
  ) %>%
  select(
    Game, total_hours_watched, total_hours_streamed,
    avg_peak_viewers, avg_peak_channels, avg_streamers, Avg_viewers,
    Avg_channels, Avg_viewer_ratio
  ) %>%
  # Only per-game values remain, so de-duplicating leaves one row per game.
  unique() %>%
  ungroup()
# Build the analysis table: sales rows enriched with review scores (matched on
# game name and platform) and with per-game Twitch metrics (matched on name).
# Left joins keep every sales row even when no review or Twitch data exists.
data_set <- global_sales_temp %>%
  left_join(
    game_reviews_temp,
    by = c("Name" = "name", "Platform" = "platform")
  ) %>%
  left_join(twitch_game_data_temp, by = c("Name" = "Game"))
# Data for the rating-vs-sales plots: one record per game and region, with
# tooltip text prepared for the interactive points.
regression_data_set <- data_set %>%
  select(-c("Platform", "Rank")) %>%
  unique() %>%
  drop_na(meta_score, user_review) %>%
  # Keep the first remaining row of each game. distinct() returns an ungrouped
  # table, so no grouping by Name leaks into the result.
  distinct(Name, .keep_all = TRUE) %>%
  # Long format: one row per game and sales region.
  gather(
    "Region", "Sales",
    c("NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales")
  ) %>%
  mutate(
    Region_updated = case_match(
      Region,
      "NA_Sales" ~ "North America",
      "EU_Sales" ~ "Europe",
      "JP_Sales" ~ "Japan",
      "Other_Sales" ~ "Other",
      .default = Region
    )
  ) %>%
  select(
    Name, Year, Genre, Region, Region_updated, Sales, meta_score, user_review
  ) %>%
  unique() %>%
  mutate(
    meta_tooltip = paste(
      "Game:", Name, "\nYear:", Year, "\nRating:", meta_score,
      "\nRegion:", Region_updated, "\nSales:", Sales
    ),
    user_tooltip = paste(
      "Game:", Name, "\nYear:", Year, "\nRating:", user_review,
      "\nRegion:", Region_updated, "\nSales:", Sales
    )
  )
# Long-format sales table: one row per record and region, plus the total
# across regions for each (Name, Platform) pair.
# The result stays grouped by Name and Platform.
df <- data_set %>%
  gather(
    "Region", "Sales",
    c("NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales")
  ) %>%
  mutate(
    Region_updated = case_match(
      Region,
      "NA_Sales" ~ "North America",
      "EU_Sales" ~ "Europe",
      "JP_Sales" ~ "Japan",
      "Other_Sales" ~ "Other",
      .default = Region
    )
  ) %>%
  group_by(Name, Platform) %>%
  mutate(total_sales = sum(Sales))
# Rank table: the 100 (Name, Platform, Genre) entries with the highest
# total_sales, numbered 1..100 in descending order of sales.
top_100 <- df %>%
  ungroup() %>%
  select(Name, Platform, total_sales, Genre) %>%
  unique() %>%
  arrange(desc(total_sales)) %>%
  slice_head(n = 100) %>%
  mutate(rank = row_number()) %>%
  select(-total_sales)
# Rows of df that belong to a top-100 entry, with its rank attached.
# An inner join keeps exactly the rows that match on all three keys.
df1 <- df %>%
  inner_join(top_100, by = c("Name", "Platform", "Genre"))
# Per-game Twitch and sales table for the bubble chart.
df2 <- df %>%
  # df is still grouped by Name and Platform; ungroup first so select() does
  # not silently re-add Platform as a grouping column.
  ungroup() %>%
  select(Name, total_hours_watched, Sales, Region, avg_peak_channels, Genre) %>%
  group_by(Name) %>%
  # Total sales of a game across all of its rows.
  mutate(all_sales = sum(Sales)) %>%
  ungroup() %>%
  # Drops rows with any missing value, e.g. games that have no Twitch data.
  drop_na() %>%
  select(
    Name, total_hours_watched, avg_peak_channels, all_sales, Genre, Region
  ) %>%
  unique()
# Top-100 rows of df with their rank; input for the sales histogram.
# The inner join keeps only rows whose Name, Platform and Genre are in top_100.
df3 <- df %>%
  inner_join(top_100, by = c("Name", "Platform", "Genre"))
In order to tie our project in with a story, we chose the scenario that we are a new gaming development company that is trying to successfully break into the gaming industry. In order to do so, we are seeking insights to help inspire our first release.
The key objectives that we want to gain from this study are:
For this project we found two data sets on Kaggle, courtesy of Ibraihim Naeem and Ran Kris. The Video Game Sales data set includes essential video game information such as genre, publisher, sales in different regions, and user ratings. The second data set, Top Games on Twitch 2016 - 2021, contains streaming data across various video games, such as average channels, total hours watched, and average viewers.
Luckily, the data sets that we found were already pretty tidy, and our main challenge with it was combining the datasets together.
Twitch is a popular video game-focused live streaming platform where users can broadcast themselves playing games and interact with viewers. However, users are not limited to just video games. Some popular alternatives include music, art, and ASMR.
Metacritic scores are determined by taking the weighted average of scores given by a large group of respected video game critics. Games with a score of 90+ are considered “must play” games. This is very different from the user rating, for which anyone can write a review and the rating is the average of all user scores.
# Stacked bar chart: sales of the five best-selling publishers, split by genre.

# Long-format sales with the total per (Publisher, Genre) and a tooltip that
# reports that total.
data_set2 <- data_set %>%
  gather(
    "Region", "Sales",
    c("NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales")
  ) %>%
  mutate(Region_updated = case_when(
    Region == "NA_Sales" ~ "North America",
    Region == "EU_Sales" ~ "Europe",
    Region == "JP_Sales" ~ "Japan",
    Region == "Other_Sales" ~ "Other",
    TRUE ~ Region
  )) %>%
  group_by(Publisher, Genre) %>%
  mutate(
    pub_sales = sum(Sales),
    tooltip = paste0("Genre: ", Genre, "\n", "Total Sales: ", pub_sales)
  ) %>%
  ungroup()

# The five publishers with the highest summed sales.
top_pub <- data_set2 %>%
  group_by(Publisher) %>%
  mutate(top_sales = sum(Sales)) %>%
  select(Publisher, top_sales) %>%
  unique() %>%
  arrange(desc(top_sales)) %>%
  head(n = 5) %>%
  select(Publisher)

bar_data_set <- data_set2 %>%
  filter(Publisher %in% top_pub$Publisher)

bar_plot <- ggplot(
  bar_data_set,
  aes(fill = Genre, x = Sales / 1000000, y = Publisher, tooltip = tooltip)
) +
  geom_bar_interactive(position = "stack", stat = "identity") +
  labs(
    x = "Sales in Millions",
    y = "Publishers",
    title = "Top Video Game Genres by Publishers"
  ) +
  # Genre is mapped to `fill`, so the palette must be set on the fill scale;
  # the previous colour scale had no effect on the bars.
  scale_fill_brewer(palette = "Set3")
ggiraph(ggobj = bar_plot)
The first visual we made is a bar chart that shows which publishers are the most successful in each region, as well as their overall global success. One thing we can take away from this visual is that Activision and Electronic Arts are the most successful publishers. The interesting thing about this graph is that you can see how the genres are distributed within each publisher. For example, the majority of Activision's sales come from the shooter genre, while Electronic Arts focuses on action games. A major takeaway for us as we try to enter the gaming industry is to consider having one of these two major publishers publish our game. Lastly, choosing a less popular genre might also be worth considering, because the market may be oversaturated with shooter and action games.
# Treemap input: distinct (Name, Genre, Region, Sales) records in long format
# with a readable region label. Platform and Rank are dropped first so rows
# that differ only in those columns collapse into one.
tree_data_set <- data_set %>%
  select(-c("Platform", "Rank")) %>%
  unique() %>%
  gather(
    "Region", "Sales",
    c("NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales")
  ) %>%
  group_by(Name, Region) %>%
  select(Name, Genre, Region, Sales) %>%
  unique() %>%
  mutate(
    Region_updated = case_match(
      Region,
      "NA_Sales" ~ "North America",
      "EU_Sales" ~ "Europe",
      "JP_Sales" ~ "Japan",
      "Other_Sales" ~ "Other",
      .default = Region
    )
  )

# Interactive treemap: regions at the top level, genres nested inside, with
# tile size driven by Sales.
d3tree2(
  treemap(
    tree_data_set,
    index = c("Region_updated", "Genre"),
    vSize = "Sales",
    title = "Sales by Region and Genre"
  ),
  rootname = "Sales by Region and Genre"
)
The treemap gives us insight into which regions have the most video game sales. This will allow us to choose a target market or a target audience to appeal to. North America and Europe have the most sales. The two most purchased genres in North America are shooters and action games, while action and sports games are more popular in Europe. Japan, on the other hand, favors role-playing games, which are the most popular genre there. This treemap will allow us to tailor our game to certain regions in order to garner more attention and popularity.
# Scatter plot of the top-100 entries: rank against total sales, coloured by
# platform, with a hover tooltip per point.
point_df <- df1 %>%
  group_by(Name, Platform) %>%
  mutate(all_sales = sum(Sales)) %>%
  ungroup() %>%
  mutate(
    tooltip = paste0(
      "Title: ", stringr::str_to_title(Name),
      "\nRank: ", rank,
      "\nSales: ", all_sales,
      "\nPlatform: ", Platform
    )
  ) %>%
  select(rank, Platform, Name, all_sales, tooltip) %>%
  unique()

viz1 <- ggplot(
  point_df,
  mapping = aes(
    x = rank,
    y = all_sales / 1000000,
    color = Platform,
    tooltip = tooltip
  )
) +
  geom_point_interactive(size = 6, stat = "identity") +
  # Reversed axis: rank 1 is drawn at the right-hand side.
  scale_x_reverse() +
  scale_color_brewer(palette = "Set3") +
  theme_minimal() +
  labs(
    x = "Rank",
    y = "Sales in Millions",
    title = "Sales of Top 100 Video Games"
  ) +
  theme(
    plot.title = element_text(hjust = .5, size = 18),
    text = element_text(size = 12)
  )
ggiraph(ggobj = viz1)
The purpose of this visual is to see how the rank of a game relates to the number of sales it achieves. The results show a notable difference in sales between ranks. It is also interesting to see how sales climb steeply once a game enters the top 50. Finally, the smooth curve becomes more scattered when we filter the graph by region.
From this graph we can also identify the most popular platforms by region. In North America, the Xbox appears to be the dominant platform, while in Europe it's the PlayStation, and in Japan it's the DS and its successor, the 3DS. Interestingly, no two regions share the same platform of choice, and each of these platforms is made by a different company.
On another note, Kinect Adventures is a standout outlier on this graph, which may be due to it being part of a bundle deal with Xbox 360 purchases. Given its success, we should consider bundling our game with a game console.
# Histogram of sales for the top-100 rows. all_sales is the sum of Sales
# within each (Name, Region, Genre) group and is repeated on every row.
histo_df <- df3 %>%
  group_by(Name, Region, Genre) %>%
  mutate(all_sales = sum(Sales)) %>%
  ungroup()

viz3 <- ggplot(histo_df, aes(x = all_sales / 1000000)) +
  geom_histogram(binwidth = 1, fill = "#A3E4D7", color = "#A3E4D7") +
  labs(
    x = "Total Sales(Millions)",
    y = "Count",
    title = "Histogram of Sales of Top 100 Games"
  ) +
  theme(
    plot.title = element_text(hjust = .5, size = 18),
    text = element_text(size = 12)
  )
ggiraph(ggobj = viz3)
This histogram of the sales of the top 100 games appears to be strongly right-skewed, meaning that most video games fall within the range of 0-5 million sales, which seems rather low to us. The histogram also gives us a reasonable expectation of how the sales for our game may play out.
This also raises the question of how we should define success. Is it reaching more than 5 million in sales? Or is it matching what most games achieve, which is less than 2.5 million sales?
# Bubble chart: Twitch hours watched against sales, one bubble per game, sized
# by average peak channels and coloured by genre.
bubble_df <- df2 %>%
  select(-c("Region")) %>%
  mutate(
    tooltip = paste0(
      "Title: ", stringr::str_to_title(Name),
      "\nGenre: ", Genre,
      "\nHours Watched On Twitch (Millions): ", total_hours_watched / 1000000,
      "\nSales (Millions): ", all_sales / 1000000,
      "\nAverage Peak Channels: ", avg_peak_channels
    )
  ) %>%
  unique()

viz2 <- ggplot(
  bubble_df,
  mapping = aes(
    x = total_hours_watched / 1000000,
    y = all_sales / 1000000,
    size = avg_peak_channels,
    color = Genre,
    tooltip = tooltip
  )
) +
  geom_point_interactive(alpha = .7) +
  scale_size(range = c(1.4, 20), name = "Average Peak Channels") +
  labs(
    x = "Hours Watched On Twitch in Millions",
    y = "Sales in Millions",
    title = "Hours Watched on Twitch to Sales"
  ) +
  scale_color_brewer(palette = "Set3") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = .5, size = 18),
    text = element_text(size = 12)
  )
ggiraph(ggobj = viz2)
A key objective of ours was to determine whether a Twitch marketing campaign is essential to gaining more sales. To our surprise, this visual shows no notable correlation between hours watched on Twitch and sales. The number of people streaming the game (the average number of channels) also does not have a strong effect on sales.
An important note to bear in mind is that before we merged the data all together, our dataset showed that League of Legends, which is a free to play game, was the most watched game on Twitch. However due to not having information on revenues for free to play games, we were unable to include them. These games typically gain income from selling battle passes, skins, expansion packs, or esports. It may be unnecessary to push a Twitch marketing campaign, but it might be beneficial to us to look into free to play games in the future.
# Metacritic score against sales: interactive points with a smoothed trend.
mscore_data_set <- regression_data_set
mscore <- ggplot(
  mscore_data_set,
  mapping = aes(x = meta_score, y = Sales / 1000000)
) +
  geom_point_interactive(aes(tooltip = meta_tooltip)) +
  geom_smooth() +
  scale_color_brewer(palette = "Set3") +
  labs(
    x = "Metacritic Rating (1-100)",
    y = "Sales in Millions",
    title = "Metacritic Rating Regression on Global Sales"
  ) +
  theme(
    plot.title = element_text(hjust = .5, size = 18),
    text = element_text(size = 12)
  )
ggiraph(ggobj = mscore)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
This regression shows us that Metacritic scores seem to have a larger impact on sales in North America than in Europe and Japan. Sales skyrocket once a video game attains a Metacritic score of about 90. Since Metacritic scores and reviews are posted in English, language barriers may be a factor in their smaller impact outside of North America.
# Distribution of Metacritic scores in the regression data (bins one wide).
mscore_histo <- regression_data_set
viz4 <- ggplot(mscore_histo, aes(x = meta_score)) +
  geom_histogram(binwidth = 1, fill = "#A3E4D7", color = "#A3E4D7") +
  labs(
    x = "Metacritic Score",
    y = "Count",
    title = "Histogram of Metacritic Scores"
  ) +
  theme(
    plot.title = element_text(hjust = .5, size = 18),
    text = element_text(size = 12)
  )
ggiraph(ggobj = viz4)
This histogram shows us that the most frequent score is around 80, which is a lot higher than we expected. We expected critics to be a little more unyielding in the scores they give.
# User review score against sales: interactive points with a smoothed trend.
uscore_data_set <- regression_data_set
uscore <- ggplot(
  uscore_data_set,
  mapping = aes(x = user_review, y = Sales / 1000000)
) +
  geom_point_interactive(aes(tooltip = user_tooltip)) +
  geom_smooth() +
  scale_color_brewer(palette = "Set3") +
  # Comma-separated labels on the sales axis.
  scale_y_continuous(labels = scales::comma) +
  labs(
    x = "User Rating (1-100)",
    y = "Sales in Millions",
    title = "User Rating Regression on Global Sales"
  ) +
  theme(
    plot.title = element_text(hjust = .5, size = 18),
    text = element_text(size = 12)
  )
ggiraph(ggobj = uscore)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
What we learn from this regression is that there does not appear to be a strong correlation between user ratings and sales in any region, although Japan may show a slightly stronger positive correlation than the other regions.
# Distribution of user review scores in the regression data (bins one wide).
uscore_histo <- regression_data_set
viz5 <- ggplot(uscore_histo, aes(x = user_review)) +
  geom_histogram(binwidth = 1, fill = "#A3E4D7", color = "#A3E4D7") +
  labs(
    x = "User Score",
    y = "Count",
    title = "Histogram of User Score"
  ) +
  theme(
    plot.title = element_text(hjust = .5, size = 18),
    text = element_text(size = 12)
  )
ggiraph(ggobj = viz5)
User ratings seem to align with metacritic ratings, but with a slightly stronger left tail. Users seem to be more critical on game reviews than critics, but we need to take into consideration that user review scores are made from a potentially larger sample size, while critic scores are generated from a smaller group that are hand selected to review the games.
Activision or Electronic Arts are our preferred publishing companies.
It might be best not to choose a popular genre (e.g., action or shooters) because of market oversaturation. Simulation or role-playing games are viable options.
A Twitch marketing campaign may not be necessary at the moment, but making our game free to play is an option to look into.
Attaining a Metacritic score in the range of 80-85 is a good goal to boost North American and European sales, but as always we want to achieve the highest score possible.
User scores seem to have more of an impact in Japan, so we need to put that into consideration to make the game player-friendly.